### Replication code for "Misdemeanor Disenfranchisement?"
### October 2018
### See the readme file for more details on what goes where in this replication package
### Contact Ariel White with questions: arwhi@mit.edu


## NB: this check relies on data from a different project (focusing on registered-as-of-2012 voters), so there's a different setup process.
## See my other paper using this approach for more details on the setup ("Family Matters? Voting Behavior in Households with Criminal Justice Contact")
## I've included (commented out here) the code used to generate the deidentified dataset included with this replication package. 

#rm(list=ls())

#library(data.table)
##library(sp) # Set CRS
##library(spdep) # Spatial statistics and definitions
##library(rgeos) # buffer and polygon analysis
##library(maptools)
##library(maps)

#load("Harrisdefendants20002012_Arcgeocoded.Rdata")
#harrismatch1 <- geodef12flat
#dim(harrismatch1)
#load("Harrisdefendants20122014_Arcgeocoded.Rdata")
#update <- geodef12flat[!(geodef12flat$fyear==2012),]; dim(update); dim(geodef12flat); #drop 2012, to avoid dupes.
#harrismatch <- rbind(harrismatch1, update); dim(harrismatch)

##trim down to Harris county addresses? using zipcode http://www.zillow.com/browse/homes/tx/harris-county/
#Harriscozips <- c(63362, 77002, 77004, 77003, 77006, 77005, 77008, 77007, 77010, 77009, 77012, 77011, 75032, 77014, 77013, 77016, 63383, 77015, 77018, 77017, 77020, 77019, 77022, 77021, 77024, 77023, 77026, 77025, 77028, 77027, 77030, 77029, 77032, 77031, 77034, 77033, 77036, 77035, 77038, 77037, 77040, 77039, 77042, 77041, 77044, 77043, 77046, 77045, 77048, 77047, 77050, 77049, 77051, 77054, 77053, 77056, 77055, 77058, 28056, 77057, 77060, 77059, 77062, 77061, 77064, 77063, 77066, 77065, 77068, 77067, 77070, 77069, 76048, 77072, 77071, 77074, 77073, 77076, 77075, 77078, 77077, 77080, 77079, 77082, 77081, 77084, 77083, 77086, 77085, 77088, 77087, 77090, 75134, 77089, 77092, 77091, 77094, 78108, 77093, 77096, 77095, 77098, 77099, 77204, 63627, 75160, 77217, 77238, 77249, 77255, 75229, 77266, 77268, 76226, 77306, 77318, 77316, 77325, 76247, 77334, 78266, 77336, 77339, 77338, 77345, 76270, 77346, 77354, 33935, 77356, 77355, 77357, 77362, 77365, 77373, 77375, 77377, 77379, 77381, 77380, 77383, 77382, 77385, 80498, 77384, 77386, 77389, 77388, 77391, 77396, 77401, 76437, 77406, 77410, 77423, 77429, 77433, 77441, 76472, 77445, 77447, 77450, 77449, 77459, 76513, 77469, 77471, 77478, 77477, 77479, 77482, 77484, 77489, 77493, 77494, 77503, 77502, 77505, 77504, 36862, 77507, 77506, 77510, 77514, 77521, 77520, 77530, 77532, 77531, 77523, 77407, 77536, 77498, 77535, 77539, 79708, 77546, 77545, 77547, 77554, 77562, 77571, 77573, 77578, 77581, 77584, 77583, 77586, 77587, 77590, 77598, 77650, 77663, 75758, 78669, 79938, 77845, 78734)
#dim(harrismatch)
#harrismatch <- harrismatch[harrismatch$ARC_ZIP %in% Harriscozips, ] 
#harrismatch <- harrismatch[harrismatch$Score >48,] 
#harrismatch$firstcase_date <-  as.Date(as.character(harrismatch$firstcased), format = "%Y%m%d")
#harrismatch <- harrismatch[harrismatch$firstcase_date > "2008-11-04" ,] 
#harrismatch <- harrismatch[harrismatch$crt<16,] # keep only misdemeanants 

##also pull in the parsed versions, since I'll need the names (see JMP code for setup)
#load("Harris0912_parseddefendants_withfuture.Rdata") #"defendants".
#defendantsold <- defendants
#load("Harris1214_parseddefendants_withfuture.Rdata") 
#defendantsold <- subset(defendantsold, select=c("def_spn", "DefLastName", "DefFirstName", "DefMiddleName", "crt")) #trim down 
#defendants <- subset(defendants, select=c("def_spn", "DefLastName", "DefFirstName", "DefMiddleName", "crt")) #trim down 
#defendants <- data.table(rbind(defendantsold, defendants))
#defendants <- defendants[defendants$crt<16,] 

##keep unique.
#setkey(defendants, "def_spn")
#defendants <- unique(defendants)
#dim(defendants)

##now merge these together-- only keep the ones that geocoded to Harris, not all the parsed defendants.
#dim(harrismatch); dim(defendants) #about 70k fewer.
#defendants_geo <- merge(defendants, harrismatch, by=c("def_spn"), all.y=T)
#dim(defendants_geo); dim(defendants); dim(harrismatch)
##now take this file and go merge it to the 2012 voter file to see who was registered. 
#defendants <- defendants_geo

#load("Harrisvoters2013_Arcgeocoded.Rdata")

#harrisproj <- fullfile[fullfile$Status=="M",]; dim(fullfile); dim(harrisproj) #as in main file, keep only geocoded voters.
#setnames(harrisproj, "Y",  "Latitude")
#setnames(harrisproj, "X", "Longitude")
#head(harrisproj)

##match defendants to voter file
##first just try exact matching on last, first names, DOB
#setnames(defendants, "def_yob", "voterYOB")
#defendants[, voterfname := as.character(DefFirstName)]
#defendants[, voterlname := as.character(DefLastName)]
#defendants[, voterfinitial := substr(voterfname, 1,1)]
#names(defendants)

#defendants[, casedate:= ymd(firstcased)]
#harrisproj[,registered :=1]

##label names here.
#setnames(harrisproj, "middle_name", "votermname")
#harrisproj[, voterfname := toupper(first_name)]
#harrisproj[, voterlname := toupper(last_name)]
#harrisproj[, voterfinitial := substr(voterfname, 1, 1)]
#library(lubridate)
#harrisproj[, voterDOB := ymd(born_at)]
#defendants[, voterDOB := ymd(def_dob)]

#voter1 <- merge(harrisproj, defendants, by=c("voterlname","voterfinitial", "voterDOB"), allow.cartesian=T, all.y=T)
#dim(voter1); dim(harrisproj); dim(defendants)
#length(unique(voter1$def_spn))
#sum(voter1$registered, na.rm=T)/nrow(voter1)

##trim down duplicates using firstname string match.
#require(stringdist)
#voter1$fnamematchdist <- mapply(stringdist, voter1$voterfname.x, voter1$voterfname.y, method="jw", p=0)
#summary(voter1$fnamematchdist) #runs 0-1, 0 is perfect match.
##note that most are perfect matches (due to birthdate)-- fuzzy first-name matching isn't doing that much work. 

#setkey(voter1, def_spn, fnamematchdist) #sort within birth records by match quality
##then drop any matches that are v. bad, and also keep only the best one of duplicated matches.
#voter1[, n:=1:.N, by=list(def_spn)]
#voter1.1<-voter1[n==1]
#dim(voter1.1); dim(voter1)
#length(unique(voter1.1$def_spn));(dim(voter1.1)) #no more dups.

## keep only the 2012 registrants, then go find them in the 2014 file to check on voting. 
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, registered := 0] #drop out the bad matches
#voter1.2 <- voter1.1[registered==1,] ; dim(voter1.2)
#voter1 <- voter1.2

##now pull in the 2014 file and see who voted in 2014.
#load("fullTXfile2014_minimal.Rdata")
#dim(fullfiletrim) #this is the full TX file from 2014; I cut it down to just a couple of columns for this so it's not so huge to load in.
#fullfiletrim[, state_file_id := as.numeric(state_file_id)]

#small1 <- merge(voter1, fullfiletrim, by="state_file_id"); dim(small1); dim(voter1); dim(fullfiletrim) #so 2k get dropped by 2014 
#going to keep everyone from the earlier file and count the unmatched as 2012 non-voters 

#small1 <- merge(voter1, fullfiletrim, by="state_file_id" , all.x=T)
#dim(small1); dim(voter1); dim(fullfiletrim)
#defvotes <- small1 

#defvotes[, vote2012 := 0] #now figure voting
#defvotes[vh12g1>0 & is.na(vh12g1)==F, vote2012 := 1]
#sum(defvotes$vote2012, na.rm=T) #little under half vote

##and also clean up some other covars
#defvotes[, black:= 0]
#defvotes[def_rac=="B", black:= 1]; summary(defvotes$black)
#defvotes[, male:= 0]
#defvotes[def_sex=="M", male:= 1]; summary(defvotes$male)

##now deidentify and save this dataset. 
#defvotesfull <- copy(defvotes)
#keep <- c("fyear", "voterYOB", "def_sex", "def_rac", "ageatfile", "anyjail", "anyfine", "anyconv", "firstcase_date", "vote2012", "male", "black")
#defvotes <- subset(defvotes, select=keep)

#save(defvotes, file="casetimingset_deidentified.Rdata") #this is dataset in replication package. 


library(data.table)
library(multiwayvcov)
# Load the deidentified analysis file shipped with the replication package;
# it provides the data.table `defvotes`.
load("casetimingset_deidentified.Rdata")

##########################################################################
# Different identification strategy: look at people charged right before vs.
# right after the election.
#
# The analysis below widens the window around election day one week at a time
# (out to roughly a year) for three outcomes -- charge only, conviction
# without jail, and jail -- separately by race.
ed2012 <- as.Date("2012-11-06") # election day

# Timing indicators. Each starts at 0 for every row and is switched to 1 only
# for rows whose first case date is before election day AND whose outcome
# falls in the corresponding group:
#   chargebefore: anyconv == 0
#   convbefore:   anyconv == 1 and anyjail == 0
#   jailbefore:   anyconv == 1 and anyjail == 1
defvotes[, `:=`(chargebefore = 0, convbefore = 0, jailbefore = 0)]
defvotes[firstcase_date < "2012-11-06" & anyconv == 0, chargebefore := 1]
defvotes[firstcase_date < "2012-11-06" & anyconv == 1 & anyjail == 0, convbefore := 1]
defvotes[firstcase_date < "2012-11-06" & anyconv == 1 & anyjail == 1, jailbefore := 1]

# ---- Varying-window before/after comparison, by race -----------------------
# For each symmetric window of i weeks around election day, three linear
# probability models of 2012 turnout are fit on defendants whose first case
# date falls strictly inside the window:
#   * charge only     (anyconv == 0):               vote2012 ~ chargebefore + male
#   * conviction only (anyconv == 1 & anyjail == 0): vote2012 ~ convbefore
#   * jail            (anyjail == 1):               vote2012 ~ jailbefore
# The jail estimates (with 95% confidence intervals) are then plotted against
# window width, once for the Black subsample and once for the white subsample.

# Extract the point estimate, p-value and 95% CI bounds for one term of a
# fitted lm.
#
# model: an object returned by lm().
# term:  name of the coefficient to report.
#
# Returns an unnamed numeric vector c(estimate, p-value, CI low, CI high).
# The term is looked up by name rather than by row position: summary.lm()
# drops aliased coefficients from its table, so positional indexing could
# silently pick up a different regressor. If the term was not estimated,
# four NAs are returned.
extract_effect <- function(model, term) {
  coefs <- summary(model)$coefficients
  if (!(term %in% rownames(coefs))) {
    return(rep(NA_real_, 4))
  }
  est <- coefs[term, 1]
  # Use the full component name; `model$df` only worked via partial matching.
  half_width <- coefs[term, 2] * qt(0.975, model$df.residual)
  unname(c(est, coefs[term, 4], est - half_width, est + half_width))
}

# Fit the three before/after models for every window width.
#
# dt:           data.table with columns firstcase_date, vote2012, male,
#               anyconv, anyjail, chargebefore, convbefore, jailbefore.
# election_day: Date at the centre of every window.
# weeks:        largest half-width of the window, in weeks; also the number
#               of rows in the result.
# first_week:   smallest half-width that is estimated. Rows below it stay NA.
#               The default of 2 reproduces the original loop, which skipped
#               the one-week window.
#               NOTE(review): presumably skipped because the one-week samples
#               are too small -- confirm.
#
# Returns a data.frame with one row per window width and columns
# n, {case,conv,jail}_{est,p,lowCI,highCI}, and row (the width in weeks).
fit_window_models <- function(dt, election_day, weeks = 52, first_week = 2) {
  stat_names <- c("est", "p", "lowCI", "highCI")
  case_cols <- paste("case", stat_names, sep = "_")
  conv_cols <- paste("conv", stat_names, sep = "_")
  jail_cols <- paste("jail", stat_names, sep = "_")
  all_cols <- c("n", case_cols, conv_cols, jail_cols)

  out <- as.data.frame(matrix(NA, nrow = weeks, ncol = length(all_cols),
                              dimnames = list(NULL, all_cols)))

  # Guard against a descending sequence when weeks < first_week.
  widths <- if (weeks >= first_week) seq.int(first_week, weeks) else integer(0)
  for (i in widths) {
    # Add/subtract days to get the window; both bounds are exclusive.
    lower <- election_day - i * 7
    higher <- election_day + i * 7
    win <- dt[firstcase_date < higher & firstcase_date > lower, ]

    case_fit <- lm(vote2012 ~ chargebefore + male, data = win[anyconv == 0])
    conv_fit <- lm(vote2012 ~ convbefore, data = win[anyconv == 1 & anyjail == 0, ])
    jail_fit <- lm(vote2012 ~ jailbefore, data = win[anyjail == 1, ])

    out[i, "n"] <- nrow(win)
    out[i, case_cols] <- extract_effect(case_fit, "chargebefore")
    out[i, conv_cols] <- extract_effect(conv_fit, "convbefore")
    out[i, jail_cols] <- extract_effect(jail_fit, "jailbefore")
  }

  out$row <- seq_len(nrow(out))
  out
}

# Plot the jail estimates and their 95% CIs against window width to a PDF.
#
# results: data.frame as returned by fit_window_models().
# file:    path of the PDF to write.
#
# The device is closed via on.exit() so a plotting error cannot leave it open.
# NOTE(review): the y-axis label says "percentage points", but the plotted
# coefficients are on the 0-1 scale of vote2012 (see ylim) -- confirm wording.
plot_jail_effects <- function(results, file) {
  pdf(file)
  on.exit(dev.off(), add = TRUE)
  plot(results$row, results$jail_est, ylim = c(-.3, .1), pch = 19,
       main = "Effect of Own Jail Sentence on Voting (Varying Windows)",
       xlab = "Weeks Around Election",
       ylab = "Change in 2012 turnout (percentage points)")
  abline(h = 0, lty = 2, col = gray(.4))
  segments(results$row, results$jail_lowCI, results$row, results$jail_highCI,
           col = "dodgerblue4")
  invisible(results)
}

weeks <- 52

# Black defendants.
black <- defvotes[black == TRUE, ]
dim(black)
storage <- fit_window_models(black, ed2012, weeks)
plot_jail_effects(storage, "before_after_ownjailblack_windows60.pdf")

# White defendants.
white <- defvotes[def_rac == "W", ]
dim(white)
storage <- fit_window_models(white, ed2012, weeks)
plot_jail_effects(storage, "before_after_ownjailwhite_windows60.pdf")


